# First go through and make sure all stations have Lat/Longs, or as many as possible. Delete secondary lat/longs
#read in the sample data, skipping first 3 lines of other headers. Format all times as hh:mm in Excel, paste into textwrangler if they need homogenization (i.e. multiple formats of times)
fish_samples<-read_excel(path="/Users/eric/google_drive/MarineGEO/Fish/FISH_DATA_21Jul2017.xlsx",sheet="Samples")
fish_events<-read_excel(path="/Users/eric/google_drive/MarineGEO/Fish/FISH_DATA_21Jul2017.xlsx",sheet="Events")
fish_genetic<-read_excel(path="/Users/eric/google_drive/MarineGEO/Fish/FISH_DATA_21Jul2017.xlsx",sheet="Genetic samples")
#these commands apply to the original fish-group template
#remove columns without mapped DwC terms mapped
#fish_samples<-fish_samples[,-grep(pattern = "^X",x = names(fish_samples),perl=T)]
#fish_events<-fish_events[,-grep(pattern = "^X",x = names(fish_events),perl=T)]
#remove records without occurrenceIDs (temp before final dataset)
#fish_samples<-fish_samples[-which(is.na(fish_samples$occurrenceID)),]
#translate fieldIDs to eventIDs
#fish_samples$eventID<-gsub("LRP 17-","KANF0",fish_samples$eventID)
#use ddply to lump all materialSampleIDs into a single field, separated by a |
#fish_samples<-ddply(fish_samples, "occurrenceID", transform, materialSampleID = #paste(materialSampleID, collapse = "|"))
#keep only the first instance of each occurrenceID
#fish_samples<-fish_samples[!duplicated(fish_samples$occurrenceID),]
#change all empty columns from logical to character class
fish_events[sapply(fish_events, is.logical)] <- lapply(fish_events[sapply(fish_events, is.logical)], as.character)
#change eventID to just the KANF value
fish_events$eventID<-str_extract(fish_events$eventID,"KANF\\d\\d\\d")
fish_samples$eventID<-str_extract(fish_samples$eventID,"KANF\\d\\d\\d")
#round the lat/longs to four digits (~ 10m uncertainty)
fish_events$decimalLatitude<-round(as.numeric(fish_events$decimalLatitude),digits = 4)
fish_events$decimalLongitude<-round(as.numeric(fish_events$decimalLongitude),digits = 4)
fish_events$coordinateUncertaintyInMeters<-as.numeric(fish_events$coordinateUncertaintyInMeters)
# class(fish_events$geoReferenceProtocol)
# class(fish_events$maximumDepthInMeters)
# class(fish_events$minimumDepthInMeters)
# class(fish_events$recordedBy)
# class(fish_events$samplingProtocol) #some missing. ask Diane to fill this in
# class(fish_events$habitatBiotic)
#remove carriage returns from Geomorphological Zone
fish_events$habitatGeomorphologicalZone<-str_replace_all(fish_events$habitatGeomorphologicalZone,pattern="\r\n",replacement = "")
fish_events$habitatSubstrate<-str_replace_all(fish_events$habitatSubstrate,pattern="\r\n",replacement = "")
fish_events$year<-as.numeric(fish_events$year)
fish_events$month<-match(fish_events$month,month.abb)
fish_events$day<-as.numeric(fish_events$day)
#Recommended fields
# class(fish_events$eventRemarks)
# class(fish_events$locality)
# class(fish_events$eventRemarks)
# class(fish_events$verbatimCoordinates)
# class(fish_events$eventMedia)
#remove ending times from eventTime field
fish_events$eventTime<-str_extract(string=fish_events$eventTime,pattern="^\\d\\d:\\d\\d")
#change all empty columns from logical to character class
fish_samples[sapply(fish_samples, is.logical)] <- lapply(fish_samples[sapply(fish_samples, is.logical)], as.character)
fish_samples$otherCatalogNumbers<-NULL # drop this for now - it will be replaced by fish_biorep below
# class(fish_samples[,which(sapply(fish_samples, is.logical))])<-"character"
# class(fish_samples$occurrenceID)
# class(fish_samples$basisofRecord)
# class(fish_samples$catalogNumber)<-"character"
# class(fish_samples$organismScope)
# class(fish_samples$eventID)
# class(fish_samples$scientificName) #eventually parse this into taxon categories?
# class(fish_samples$taxonRank) #using this
# class(fish_samples$individualCount)
# class(fish_samples$institutionID)
# make identifiedBy go firstname lastname
fish_samples$identifiedBy<-str_replace(fish_samples$identifiedBy, pattern="(\\w+), (.+)", replacement="\\2 \\1")
fish_samples$catalogNumber<-as.character(fish_samples$catalogNumber)
colnames(fish_genetic)[5]<-"BiorepositoryID"
colnames(fish_genetic)[6]<-"tissueNotes"
colnames(fish_genetic)[2]<-"occurrenceID"
#use ddply to lump all materialSampleIDs into a single field, separated by a |
fish_biorep<-ddply(fish_genetic, "occurrenceID", transform, otherCatalogNumbers = paste(BiorepositoryID, collapse = "|"), tissueNotes = paste(tissueNotes, collapse="|"))
#keep only the first instance of each occurrenceID
fish_biorep<-fish_biorep[!duplicated(fish_biorep$otherCatalogNumbers),]
fish_biorep$otherCatalogNumbers<-as.character(fish_biorep$otherCatalogNumbers)
#first join the biorep numbers to this data
fish_samples<-left_join(fish_samples,fish_biorep[,c(2,6,7)],by="occurrenceID")
#now join samples and events
fish<-left_join(fish_samples,fish_events,by="eventID")
#optionally make it from the points provided
#bbox<-make_bbox(lon=fish2$decimalLongitude,lat=fish2$decimalLatitude)
#by individual
fish2<-fish %>% group_by(decimalLatitude, decimalLongitude) %>% summarize(count=n())
#by species
fish3 <- fish %>% group_by(scientificName, decimalLatitude, decimalLongitude) %>% summarize(count=n())
#richness
fish4<-fish3 %>% group_by(decimalLatitude,decimalLongitude) %>% summarize(richness=n())
fish6<-left_join(fish2,fish4)
fish_map<-ggmap(dmap) + geom_point(data = fish6, mapping = aes(x = decimalLongitude, y = decimalLatitude, size=count, color=richness)) + scale_color_gradient(low = "green", high="red") + guides(color=guide_colorbar(title="Species Richness",), size=guide_legend(title="Individual Count")) + theme(axis.title=element_blank())
fish_map
ggsave(fish_map,filename="./output/fish_map.pdf")
#Add in KANI and KANM stations to events, edit eventIDs for capitalization
algae_samples<-read_excel("/Users/eric/google_drive/MarineGEO/algae/MarineGEOHI_bioassessment_master_KANA.xlsx", sheet="Sample")
algae_events<-read_excel("/Users/eric/google_drive/MarineGEO/algae/MarineGEOHI_bioassessment_master_KANA.xlsx", sheet="Station")
#change all empty columns from logical to character class
algae_events[sapply(algae_events, is.logical)] <- lapply(algae_events[sapply(algae_events, is.logical)], as.character)
#round to 4 decimal digits (~10m uncertainty) and meake sure longitude is negative
algae_events$decimalLatitude<-round(as.numeric(algae_events$decimalLatitude),digits = 4)
algae_events$decimalLongitude<-abs(round(as.numeric(algae_events$decimalLongitude),digits = 4))*-1
# class(algae_events$eventID)
# class(algae_events$geoReferenceProtocol) #good
# class(algae_events$coordinateUncertaintyInMeters) #good
# class(algae_events$maximumDepthInMeters) #some missing. ask Melinda to fill this in
# class(algae_events$minimumDepthInMeters)#some missing. ask Melinda to fill this in
# class(algae_events$recordedBy) #some missing. ask Melinda to fill this in
# class(algae_events$samplingProtocol) #some missing. ask Melinda to fill this in
# class(algae_events$`habitatBiotic`)
# class(algae_events$`habitatGeomorphologicalZone`)
# class(algae_events$`habitatSubstrate`)
#Recommended fields
# class(algae_events$eventRemarks)
# class(algae_events$locality)
# class(algae_events$eventRemarks)
# class(algae_events$verbatimCoordinates)
# class(algae_events$eventMedia)
#format time correctly
algae_events$eventTime<-as.character(parse_date_time(algae_events$eventTime, orders="ymdHMS", tz="HST"),format="%H:%M")
#change all empty columns from logical to character class
algae_samples[sapply(algae_samples, is.logical)] <- lapply(algae_samples[sapply(algae_samples, is.logical)], as.character)
# class(algae_samples$occurrenceID)
# class(algae_samples$catalogNumber)
# class(algae_samples$otherCatalogNumbers)
# class(algae_samples$organismScope)
# class(algae_samples$eventID)
# class(algae_samples$scientificName) #eventually parse this into taxon categories?
# class(algae_samples$taxonRank) # Melinda needs to populate this...
# class(algae_samples$identifiedBy)
# class(algae_samples$individualCount)
algae_samples$basisofRecord<-"specimen"
algae_samples$institutionID<-"USNM"
algae<-left_join(algae_samples,algae_events,by="eventID")
#by individual
algae2<-algae %>% group_by(decimalLatitude, decimalLongitude) %>% summarize(count=n())
#by species
algae3 <- algae[-which(algae$scientificName=="?"),] %>% group_by(scientificName, decimalLatitude, decimalLongitude) %>% summarize(count=n())
#richness
algae4<-algae3 %>% group_by(decimalLatitude,decimalLongitude) %>% summarize(richness=n())
algae5<-left_join(algae2,algae4)
#algae5<-algae5[which(is.na(algae5$decimalLatitude)),]
#algae5<-algae5[-which(algae5$scientificName=="?"),]
algae5[4,4]<-1
algae5[11,4]<-2
extra<-algae5[33,]
algae5<-algae5[-33,] #remove outlier
algae_map<-ggmap(dmap) + geom_point(data = algae5, mapping = aes(x = decimalLongitude, y = decimalLatitude, size=count, color=richness)) + scale_color_gradient(low = "green", high="red") + guides(color=guide_colorbar(title="Type Richness",), size=guide_legend(title="Individual Count")) + theme(axis.title=element_blank()) + geom_point(data=extra, mapping=aes(x = decimalLongitude, y = decimalLatitude, size=30, color=30))
#add outlier back in as a red dot size=30
algae_map
ggsave(algae_map,filename="./output/algae_map.pdf")
invert_events<-read_excel(path="/Users/eric/google_drive/MarineGEO/Inverts/BKANE_063017_FIMS.xlsx", sheet = "Station", skip=2)
invert_samples<-read_excel(path="/Users/eric/google_drive/MarineGEO/Inverts/BKANE_063017_FIMS.xlsx", sheet = "Specimen",skip=3)
#remove columns that don't appear in MarineGEO schema
invert_samples<-invert_samples[,-grep(pattern = "^X",x = names(invert_samples),perl=T)]
invert_events<-invert_events[,-grep(pattern = "^X",x = names(invert_events),perl=T)]
#remove records without occurrenceIDs (temp before final dataset)
invert_samples<-invert_samples[-which(is.na(invert_samples$scientificName)),]
This code will populate taxonRank with the lowest known taxonomic category
invert_samples$taxonRank[which(!is.na(invert_samples$Phylum))]<-"Phylum"
invert_samples$taxonRank[which(!is.na(invert_samples$Class))]<-"Class"
invert_samples$taxonRank[which(!is.na(invert_samples$Subclass))]<-"Subclass"
invert_samples$taxonRank[which(!is.na(invert_samples$Order))]<-"Order"
invert_samples$taxonRank[which(!is.na(invert_samples$Suborder))]<-"Suborder"
invert_samples$taxonRank[which(!is.na(invert_samples$Superfamily))]<-"Superfamily"
invert_samples$taxonRank[which(!is.na(invert_samples$Family))]<-"Family"
invert_samples$taxonRank[which(!is.na(invert_samples$Subfamily))]<-"Subfamily"
invert_samples$taxonRank[which(!is.na(invert_samples$Genus))]<-"Genus"
invert_samples$taxonRank[which(!is.na(invert_samples$Species))]<-"Species"
invert_samples$taxonRank[grep("sp\\.",invert_samples$Species)]<-"Genus"
# Note: invert data need a lot of cleaning, including importing stations from the algae and meiofauna team, and making new stations with "A" or "B" appended. Currently 839 samples do not have station information because of this.
#change all empty columns from logical to character class
invert_events[sapply(invert_events, is.logical)] <- lapply(invert_events[sapply(invert_events, is.logical)], as.character)
#round to 4 decimal digits (~10m uncertainty) and meake sure longitude is negative
invert_events$decimalLatitude<-round(as.numeric(invert_events$decimalLatitude),digits = 4)
invert_events$decimalLongitude<-abs(round(as.numeric(invert_events$decimalLongitude),digits = 4))*-1
invert_events$geoReferenceProtocol<-"GPS"
invert_events$coordinateUncertaintyInMeters<-100
# class(invert_events$maximumDepthInMeters) #some missing. ask John to fill this in
# class(invert_events$minimumDepthInMeters)#some missing. ask John to fill this in
#Format the date properly, get rid of the "raw" field
invert_events$day<-day(invert_events$eventDate)
invert_events$month<-month(invert_events$eventDate)
invert_events$year<-year(invert_events$eventDate)
invert_events$eventDate<-NULL
# class(invert_events$recordedBy) #good
# class(invert_events$samplingProtocol) #some missing. ask John to fill this in
# class(invert_events$habitatGeomorphologicalZone) #some missing. ask John to fill this in. #this needs to be aligned with the schema
# class(invert_events$habitatSubstrate) # some missing. ask John to fill this in. #this needs to be aligned with the schema
#Recommended fields
# class(invert_events$locality)
# class(invert_events$eventRemarks)
#change all empty columns from logical to character class
invert_samples[sapply(invert_samples, is.logical)] <- lapply(invert_samples[sapply(invert_samples, is.logical)], as.character)
# class(invert_samples$occurrenceID)
# class(invert_samples$scientificName) #eventually parse this into taxon categories?
# class(invert_samples$taxonRank) #using this
invert_samples$basisOfRecord<-"specimen"
invert_samples$organismScope<-"organism"
#remove eventIDs that have two possibilities, assume it is the first one for now
invert_samples$eventID<-gsub(pattern="(\\w) or \\w",replacement="\\1",x= invert_samples$eventID,perl=T)
invert_samples$identifiedBy<-"Paulay, Gustav" # for now. don't know if they kept this info
#assume that blank counts had 1 individual for now
invert_samples$individualCount[is.na(invert_samples$individualCount)]<-1
invert_samples$institutionID<-"FLMNH"
invert<-left_join(invert_samples,invert_events,by="eventID")
#by individual
invert2<-invert %>% group_by(decimalLatitude, decimalLongitude) %>% summarize(individualCount=sum(individualCount,na.rm = T))
#by species
invert3 <- invert %>% group_by(scientificName, decimalLatitude, decimalLongitude) %>% summarize(count=n())
#richness
invert4<-invert3 %>% group_by(decimalLatitude,decimalLongitude) %>% summarize(richness=n())
invert5<-left_join(invert2,invert4)
invert_map<-ggmap(dmap) + geom_point(data = invert5[-c(61,62),], mapping = aes(x = decimalLongitude, y = decimalLatitude, size=individualCount, color=richness)) + scale_color_gradient(low = "green", high="red") + guides(color=guide_colorbar(title="Type Richness",), size=guide_legend(title="Individual Count")) + theme(axis.title=element_blank())
invert_map
ggsave(invert_map,filename="./output/invert_map.pdf")
#note meio data still need some cleaning, including importing some stations from inverts and fish, and fixing up eventIDs to have 3 digits instead of 2. There are 27 specimens that do not have station information because of this.
meio_events<-read_excel(path="/Users/eric/google_drive/MarineGEO/Meiofauna/Hawaii2017_meiofauna_MarineGEO.xlsx", sheet = "station data", skip=1)
meio_samples<-read_excel(path="/Users/eric/google_drive/MarineGEO/Meiofauna/Hawaii2017_meiofauna_MarineGEO.xlsx", sheet = "specimen data",skip=2)
#remove columns that don't appear in MarineGEO schema
meio_samples<-meio_samples[,-grep(pattern = "^X",x = names(meio_samples),perl=T)]
meio_events<-meio_events[,-grep(pattern = "^X",x = names(meio_events),perl=T)]
#add occurrence IDs to one investigators samples
meio_samples$occurrenceID[which(is.na(meio_samples$occurrenceID))]<-"UJ"
#make occurrence IDs unique
meio_samples$occurrenceID<-make.unique(meio_samples$occurrenceID, sep="_")
#change all empty columns from logical to character class
meio_events[sapply(meio_events, is.logical)] <- lapply(meio_events[sapply(meio_events, is.logical)], as.character)
#round to 4 decimal digits (~10m uncertainty) and meake sure longitude is negative
meio_events$decimalLatitude<-round(as.numeric(meio_events$decimalLatitude),digits = 4)
meio_events$decimalLongitude<-abs(round(as.numeric(meio_events$decimalLongitude),digits = 4))*-1
meio_events$coordinateUncertaintyInMeters<-100
# class(meio_events$maximumDepthInMeters)
# class(meio_events$minimumDepthInMeters)
# class(meio_events$recordedBy) #ask Freya to follow format - add last names, and pipes between names
# class(meio_events$samplingProtocol) #some missing. ask Freya to fill this in
# class(meio_events$habitatGeomorphologicalZone) #some missing. ask Freya to fill this in
# class(meio_events$habitatSubstrate) # some missing. ask Freya to fill this in
# class(meio_events$habitatBiotic)
#Recommended fields
# class(meio_events$locality)
# class(meio_events$eventRemarks)
meio_events$day<-day(meio_events$eventDate)
meio_events$month<-month(meio_events$eventDate)
meio_events$year<-year(meio_events$eventDate)
meio_events$eventDate<-NULL
meio_events$eventTime<-as.character(parse_date_time(meio_events$eventTime, orders="ymdHMS", tz="HST"),format="%H:%M")
meio_events$eventMedia<-"N"
#change all empty columns from logical to character class
meio_samples[sapply(meio_samples, is.logical)] <- lapply(meio_samples[sapply(meio_samples, is.logical)], as.character)
meio_samples$basisofRecord<-"specimen" #check with Frey that this is correct
meio_samples$institutionID<-"USNM"
# class(meio_samples$scientificName) #eventually parse this into taxon categories?
# class(meio_samples$taxonRank) #using this
# class(meio_samples$occurrenceID)
# class(meio_samples$catalogNumber)
# class(meio_samples$otherCatalogNumbers)
# class(meio_samples$organismScope)
# class(meio_samples$eventID)
# class(meio_samples$identifiedBy)
# class(meio_samples$individualCount)
meio<-left_join(meio_samples,meio_events,by="eventID")
#by individual
meio2<-meio %>% group_by(decimalLatitude, decimalLongitude) %>% summarize(individualCount=sum(individualCount,na.rm = T))
#by species
meio3 <- meio %>% group_by(scientificName, decimalLatitude, decimalLongitude) %>% summarize(count=n())
#richness
meio4<-meio3 %>% group_by(decimalLatitude,decimalLongitude) %>% summarize(richness=n())
meio5<-left_join(meio2,meio4)
meio_map<-ggmap(dmap) + geom_point(data = meio5, mapping = aes(x = decimalLongitude, y = decimalLatitude, size=individualCount, color=richness)) + scale_color_gradient(low = "green", high="red") + guides(color=guide_colorbar(title="Type Richness",), size=guide_legend(title="Individual Count")) + theme(axis.title=element_blank())
meio_map
ggsave(meio_map,filename="./output/meio_map.pdf")
arms_events<-read_excel(path="/Users/eric/google_drive/MarineGEO/ARMS/MarineGEOHI_bioassessment_master-ARMS.xlsx", sheet = "Station")
arms_samples<-read_excel(path="/Users/eric/google_drive/MarineGEO/ARMS/MarineGEOHI_bioassessment_master-ARMS.xlsx", sheet = "Sample")
#change all empty columns from logical to character class
arms_events[sapply(arms_events, is.logical)] <- lapply(arms_events[sapply(arms_events, is.logical)], as.character)
#round to 4 decimal digits (~10m uncertainty) and meake sure longitude is negative
arms_events$decimalLatitude<-round(as.numeric(arms_events$decimalLatitude),digits = 4)
arms_events$decimalLongitude<-abs(round(as.numeric(arms_events$decimalLongitude),digits = 4))*-1
arms_events$coordinateUncertaintyInMeters<-10
arms_events$minimumDepthInMeters<-as.numeric(arms_events$minimumDepthInMeters)
# class(arms_events$geoReferenceProtocol)
# class(arms_events$eventID)
# class(arms_events$maximumDepthInMeters)
# class(arms_events$recordedBy) #ask Laetitia to follow name format
# class(arms_events$samplingProtocol)
# class(arms_events$habitatGeomorphologicalZone)
# class(arms_events$habitatSubstrate)
# class(arms_events$habitatBiotic)
#Recommended fields
# class(arms_events$locality)
# class(arms_events$year)
# class(arms_events$month)
# class(arms_events$day)
arms_events$eventMedia<-"Y" #still need to get these from Laetitia
#change all empty columns from logical to character class
arms_samples[sapply(arms_samples, is.logical)] <- lapply(arms_samples[sapply(arms_samples, is.logical)], as.character)
# class(arms_samples$occurrenceID)
# class(arms_samples$basisofRecord)
# class(arms_samples$catalogNumber)
# class(arms_samples$otherCatalogNumbers)
# class(arms_samples$organismScope) #were there some slurries too?
# class(arms_samples$eventID)
# class(arms_samples$scientificName) #eventually parse this into taxon categories?
# class(arms_samples$taxonRank) #using this
# class(arms_samples$identifiedBy)
# class(arms_samples$individualCount)
# change NAs for individualCount to 1 for now
arms_samples$individualCount[which(is.na(arms_samples$individualCount))]<-1
arms_samples$institutionID<-"USNM"
arms<-left_join(arms_samples,arms_events,by="eventID")
#by individual
arms2<-arms %>% group_by(decimalLatitude, decimalLongitude) %>% summarize(individualCount=sum(individualCount,na.rm = T))
#by species
arms3 <- arms %>% group_by(scientificName, decimalLatitude, decimalLongitude) %>% summarize(count=n())
#richness
arms4<-arms3 %>% group_by(decimalLatitude,decimalLongitude) %>% summarize(richness=n())
arms5<-left_join(arms2,arms4)
arms_map<-ggmap(dmap) + geom_point(data = arms5, mapping = aes(x = decimalLongitude, y = decimalLatitude, size=individualCount, color=richness)) + scale_color_gradient(low = "green", high="red") + guides(color=guide_colorbar(title="Type Richness",), size=guide_legend(title="Individual Count")) + theme(axis.title=element_blank())
arms_map
ggsave(arms_map,filename="./output/arms_map.pdf")
trans_events<-read_excel(path="/Users/eric/google_drive/MarineGEO/Transects/bioassessment_KANV.xlsx", sheet = "Station")
trans_samples<-read_excel(path="/Users/eric/google_drive/MarineGEO/Transects/bioassessment_KANV.xlsx", sheet = "Sample")
trans_event_photos<-read_excel(path="/Users/eric/google_drive/MarineGEO/Transects/KANV_benthic-photos_filenames_20170711.xlsx", sheet = "Sheet1")
trans_organism_photos<-read_excel(path="/Users/eric/google_drive/MarineGEO/Transects/KANV_photo-vouchers_20170712.xlsx", sheet = "Sheet1")
# need to revisit this to see if I can parse out Zach's and Ross' initials to keep their observations separate
# event photos
#pop off the eventID into its own field
trans_event_photos$eventID<-sub(pattern="(KANV\\d\\d\\d)_.+",replacement = "\\1", trans_event_photos$eventMedia, perl=T)
#use ddply to lump all eventMedia into a single field, separated by a |
trans_eventMedia<-ddply(trans_event_photos, "eventID", transform, eventMedia = paste(eventMedia, collapse = "|"))
#keep only the first instance of each occurrenceID
trans_eventMedia<-trans_eventMedia[!duplicated(trans_eventMedia$eventID),]
trans_events<-left_join(trans_events, trans_eventMedia, by="eventID")
# organism photos
## delete the field to be added later
trans_samples$associatedMedia<-"NULL"
#pop off the eventID again into its own field
trans_organism_photos$eventID<-sub(pattern="(KANV\\d\\d\\d)_.+",replacement = "\\1", trans_organism_photos$associatedMedia, perl=T)
#pop off the species name into its own field
trans_organism_photos$scientificName<-sub(pattern=".+_(\\d{8})_(\\w+-[a-z]+)_.+", replacement = "\\2", trans_organism_photos$associatedMedia,perl=T)
trans_organism_photos$scientificName<-sub(pattern="-", replacement=" ",x = trans_organism_photos$scientificName)
#pop off the initials of the collector, and replace with full name
trans_organism_photos$identifiedBy<-str_extract(pattern="ZF|RW", string=trans_organism_photos$associatedMedia)
trans_organism_photos$identifiedBy[which(trans_organism_photos$identifiedBy=="ZF")]<-"Zach Foltz"
trans_organism_photos$identifiedBy[which(trans_organism_photos$identifiedBy=="RW")]<-"Ross Whippo"
#use ddply to lump all associatedMedia into a single field, separated by a |
trans_associatedMedia<-ddply(trans_organism_photos, c("eventID","scientificName","identifiedBy"), transform, associatedMedia = paste(associatedMedia, collapse = "|"))
#remove all but the first instance
trans_associatedMedia<-trans_associatedMedia[!duplicated(trans_associatedMedia$associatedMedia),]
trans_samples<-left_join(trans_samples, trans_associatedMedia, by=c("eventID","scientificName","identifiedBy"))
#change all empty columns from logical to character class
trans_events[sapply(trans_events, is.logical)] <- lapply(trans_events[sapply(trans_events, is.logical)], as.character)
#round to 4 decimal digits (~10m uncertainty) and meake sure longitude is negative
trans_events$decimalLatitude<-round(as.numeric(trans_events$decimalLatitude),digits = 4)
trans_events$decimalLongitude<-abs(round(as.numeric(trans_events$decimalLongitude),digits = 4))*-1
trans_events$minimumDepthInMeters<-as.numeric(trans_events$minimumDepthInMeters)
# class(trans_events$eventID)
# class(trans_events$coordinateUncertaintyInMeters)
# class(trans_events$maximumDepthInMeters)
# class(trans_events$recordedBy)
# class(trans_events$samplingProtocol)
# class(trans_events$habitatGeomorphologicalZone)
# class(trans_events$habitatSubstrate)
# class(trans_events$habitatBiotic)
#Recommended fields
# class(trans_events$locality)
# class(trans_events$year)
# class(trans_events$month)
# class(trans_events$day)
trans_events$eventMedia<-"Y"
trans_events$eventTime<-as.character(parse_date_time(trans_events$eventTime, orders="ymdHMS", tz="HST"),format="%H:%M")
#change all empty columns from logical to character class
trans_samples[sapply(trans_samples, is.logical)] <- lapply(trans_samples[sapply(trans_samples, is.logical)], as.character)
# class(trans_samples$occurrenceID)
# class(trans_samples$basisofRecord)
# class(trans_samples$catalogNumber)
# class(trans_samples$otherCatalogNumbers)
# class(trans_samples$organismScope)
# class(trans_samples$eventID)
# class(trans_samples$scientificName) #eventually parse this into taxon categories?
# class(trans_samples$taxonRank) #using this
# class(trans_samples$identifiedBy)
# class(trans_samples$individualCount)
trans_samples$institutionID<-"USNM"
trans<-left_join(trans_samples,trans_events,by="eventID")
#by individual
trans2<-trans %>% group_by(decimalLatitude, decimalLongitude) %>% summarize(individualCount=sum(individualCount,na.rm = T))
#by species
trans3 <- trans %>% group_by(scientificName, decimalLatitude, decimalLongitude) %>% summarize(count=n(), individualCount=sum(individualCount,na.rm=T))
#richness
trans4<-trans3 %>% group_by(decimalLatitude,decimalLongitude) %>% summarize(richness=n())
trans5<-left_join(trans2,trans4)
trans_map<-ggmap(dmap) + geom_point(data = trans5, mapping = aes(x = decimalLongitude, y = decimalLatitude, size=individualCount, color=richness)) + scale_color_gradient(low = "green", high="red") + guides(color=guide_colorbar(title="Species Richness",), size=guide_legend(title="Individual Count")) + theme(axis.title=element_blank())
trans_map
ggsave(trans_map,filename="./output/trans_map.pdf")
Now to join everything into one monster database
a<-full_join(fish,algae)
b<-full_join(a,invert)
c<-full_join(b,meio)
d<-full_join(c,arms)
MarineGEOHI<-full_join(d,trans)
#read in the flat schema
flat_schema<-as.vector(read.csv("flat_schema.csv", header=F, stringsAsFactors = F))
#reduce the joined database to just the columns in the schema, and sort on column order
MarineGEOHI2<-MarineGEOHI[,match(flat_schema, names(MarineGEOHI))]
write.csv(MarineGEOHI2,"./output/MarineGEOHI_data_1.0.csv",row.names = F)